Sentiment Analysis, also known as Opinion Mining, is the field within NLP that seeks to identify and extract opinions within text.
The interest is with:
In [1]:
%%bash
# List the CSV files in the working directory with human-readable sizes.
# The dot is escaped and the pattern anchored so only names ending in ".csv"
# match (an unescaped "." matches any character, e.g. "notcsv").
ls -lh | grep '\.csv$'
In [2]:
# built-in libs
import email
# processing libs
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
# Build a set from wordcloud's STOPWORDS; the word-cloud cells pass it as `stopwords=`.
stopwords = set(STOPWORDS)
# display libs
from tqdm import tqdm_notebook
In [3]:
# Open emails.csv as a chunked reader (10,000 rows per chunk) instead of
# loading the whole file at once.
emails_full_df = pd.read_csv('emails.csv', chunksize=10000)
# Work only with the first chunk; the remaining chunks are never read here.
emails_df = next(emails_full_df)
In [4]:
# Show the (rows, columns) shape of the loaded chunk, then preview its first rows.
print(emails_df.shape)
emails_df.head()
Out[4]:
In [5]:
# Print the frame summary (columns, non-null counts, dtypes, memory usage).
emails_df.info()
In [6]:
%time
messages_obj_lst = []
messages_str_lst = []
message_metadata = {}
for i in tqdm_notebook(range(emails_df.shape[0])):
msg = email.message_from_string(emails_df.message[i])
for msg_property in msg:
if msg_property in message_metadata:
message_metadata[msg_property][i] = msg[msg_property]
else:
message_metadata[msg_property] = ['N/A'] * emails_df.shape[0]
payload = msg.get_payload() # decode=True
messages_obj_lst.append(msg)
messages_str_lst.append(payload) #.encode('utf-8').decode('unicode_escape')
#except KeyboardInterrupt:
# break
print('messages_obj_lst size: %i' % len(messages_obj_lst))
In [7]:
# Attach the parsed Message objects and their payloads as new columns.
emails_df = emails_df.assign(
    message_obj=pd.Series(messages_obj_lst).values,
    payload=pd.Series(messages_str_lst).values,
)
# Flatten each payload onto one line. The pattern is a literal newline with
# regex=False so the result does not depend on the pandas version's `regex`
# default (as a literal, r'\n' is backslash + "n" and would match nothing).
# Newlines become a space rather than '' so words on adjacent lines are not
# fused into one token before sentiment scoring and word counting.
emails_df['payload'] = emails_df.payload.str.replace('\n', ' ', regex=False)
In [8]:
# Preview the frame now that the message_obj and payload columns exist.
emails_df.head()
Out[8]:
In [9]:
# The intermediate lists are already stored as columns of emails_df;
# release both names.
del messages_obj_lst, messages_str_lst
# Remove the unparsed `message` column in place.
emails_df.drop(columns=['message'], inplace=True)
In [10]:
# Preview the frame after the `message` column was dropped.
emails_df.head()
Out[10]:
In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
In [12]:
# Score every payload with VADER; polarity_scores() yields one result per
# email, kept in row order and stored in a temporary `sent_obj` column.
sid = SentimentIntensityAnalyzer()
ss_lst = [
    sid.polarity_scores(text)
    for text in tqdm_notebook(emails_df.payload)
]
emails_df['sent_obj'] = ss_lst
In [13]:
# Preview the frame with the new sent_obj column.
emails_df.head()
Out[13]:
In [14]:
# Split each sent_obj entry into one plain column per key.
# Series.map on the single column replaces DataFrame.apply(axis=1), which
# built a full row Series for every email just to read one cell.
vader_scores = emails_df['sent_obj']
emails_df['sent_pos'] = vader_scores.map(lambda scores: scores['pos'])
emails_df['sent_neg'] = vader_scores.map(lambda scores: scores['neg'])
emails_df['sent_neu'] = vader_scores.map(lambda scores: scores['neu'])
emails_df['sent_comp'] = vader_scores.map(lambda scores: scores['compound'])
# The per-key columns now carry the scores; drop the temporary column.
emails_df.drop('sent_obj', axis=1, inplace=True)
In [15]:
# Preview the frame with sent_pos / sent_neg / sent_neu / sent_comp in place of sent_obj.
emails_df.head()
Out[15]:
In [16]:
# First rows whose sent_pos exceeds 0.5, shown without the bulky message_obj column.
emails_df.loc[emails_df['sent_pos'] > 0.5].drop(columns=['message_obj']).head()
Out[16]:
In [17]:
# Word cloud of emails with sent_pos > 0.5.
# Join the actual payload strings: str(Series) is only the truncated repr
# (index labels, "...", "Name: payload, dtype: object"), not the email text.
cloud_text = ' '.join(
    emails_df.loc[emails_df['sent_pos'] > 0.5, 'payload'].dropna().astype(str)
)
if cloud_text.strip():
    wordcloud = WordCloud(
        margin=0,
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        random_state=42
    ).generate(cloud_text)
    plt.rcParams['figure.dpi'] = 600
    plt.rcParams['figure.figsize'] = (10,8)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
else:
    # generate() raises on empty text, so report instead of crashing.
    print('No emails with sent_pos > 0.5; nothing to plot.')
In [18]:
# First rows whose sent_neg exceeds 0.5, shown without the bulky message_obj column.
emails_df.loc[emails_df['sent_neg'] > 0.5].drop(columns=['message_obj']).head()
Out[18]:
In [19]:
# Word cloud of emails with sent_neg > 0.5.
# Join the actual payload strings: str(Series) is only the truncated repr
# (index labels, "...", "Name: payload, dtype: object"), not the email text.
cloud_text = ' '.join(
    emails_df.loc[emails_df['sent_neg'] > 0.5, 'payload'].dropna().astype(str)
)
if cloud_text.strip():
    wordcloud = WordCloud(
        margin=0,
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        random_state=42
    ).generate(cloud_text)
    plt.rcParams['figure.dpi'] = 600
    plt.rcParams['figure.figsize'] = (10,8)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
else:
    # generate() raises on empty text, so report instead of crashing.
    print('No emails with sent_neg > 0.5; nothing to plot.')
In [ ]:
In [20]:
from textblob import TextBlob
In [21]:
# Score every payload with TextBlob; each `.sentiment` result is kept in row
# order and stored in a temporary `sent_obj` column.
ss_lst = [
    TextBlob(text).sentiment
    for text in tqdm_notebook(emails_df.payload)
]
emails_df['sent_obj'] = ss_lst
In [22]:
# Preview the frame with the TextBlob sent_obj column added.
emails_df.head()
Out[22]:
In [23]:
# Split each sent_obj entry into plain polarity / subjectivity columns.
# Series.map on the single column replaces DataFrame.apply(axis=1), which
# built a full row Series for every email just to read one attribute.
textblob_scores = emails_df['sent_obj']
emails_df['sent_polarity'] = textblob_scores.map(lambda sentiment: sentiment.polarity)
emails_df['sent_subjectivity'] = textblob_scores.map(lambda sentiment: sentiment.subjectivity)
# The two columns now carry the scores; drop the temporary column.
emails_df.drop('sent_obj', axis=1, inplace=True)
In [24]:
# Preview the frame with sent_polarity / sent_subjectivity in place of sent_obj.
emails_df.head()
Out[24]:
In [25]:
# First rows with sent_polarity of at least 0.7, shown without the bulky message_obj column.
emails_df.loc[emails_df['sent_polarity'] >= 0.7].drop(columns=['message_obj']).head()
Out[25]:
In [26]:
# Word cloud of emails with sent_polarity >= 0.7.
# Join the actual payload strings: str(Series) is only the truncated repr
# (index labels, "...", "Name: payload, dtype: object"), not the email text.
cloud_text = ' '.join(
    emails_df.loc[emails_df['sent_polarity'] >= 0.7, 'payload'].dropna().astype(str)
)
if cloud_text.strip():
    wordcloud = WordCloud(
        margin=0,
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        random_state=42
    ).generate(cloud_text)
    plt.rcParams['figure.dpi'] = 600
    plt.rcParams['figure.figsize'] = (10,8)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
else:
    # generate() raises on empty text, so report instead of crashing.
    print('No emails with sent_polarity >= 0.7; nothing to plot.')
In [27]:
# First rows with sent_polarity below -0.8, shown without the bulky message_obj column.
emails_df.loc[emails_df['sent_polarity'] < -0.8].drop(columns=['message_obj']).head()
Out[27]:
In [28]:
# Word cloud of emails with sent_polarity < -0.8.
# Join the actual payload strings: str(Series) is only the truncated repr
# (index labels, "...", "Name: payload, dtype: object"), not the email text.
cloud_text = ' '.join(
    emails_df.loc[emails_df['sent_polarity'] < -0.8, 'payload'].dropna().astype(str)
)
if cloud_text.strip():
    wordcloud = WordCloud(
        margin=0,
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        random_state=42
    ).generate(cloud_text)
    plt.rcParams['figure.dpi'] = 600
    plt.rcParams['figure.figsize'] = (10,8)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
else:
    # generate() raises on empty text, so report instead of crashing.
    print('No emails with sent_polarity < -0.8; nothing to plot.')
In [29]:
# First rows with sent_subjectivity above 0.8, shown without the bulky message_obj column.
emails_df.loc[emails_df['sent_subjectivity'] > 0.8].drop(columns=['message_obj']).head()
Out[29]:
In [30]:
# Word cloud of emails with sent_subjectivity > 0.8.
# Join the actual payload strings: str(Series) is only the truncated repr
# (index labels, "...", "Name: payload, dtype: object"), not the email text.
cloud_text = ' '.join(
    emails_df.loc[emails_df['sent_subjectivity'] > 0.8, 'payload'].dropna().astype(str)
)
if cloud_text.strip():
    wordcloud = WordCloud(
        margin=0,
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        random_state=42
    ).generate(cloud_text)
    plt.rcParams['figure.dpi'] = 600
    plt.rcParams['figure.figsize'] = (10,8)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
else:
    # generate() raises on empty text, so report instead of crashing.
    print('No emails with sent_subjectivity > 0.8; nothing to plot.')
In [ ]:
In [ ]: